##### NIGERIA DATA CLEANING CODE #####

# Article: Do Online Newspapers Promote or Undermine Nation-Building in Divided Societies? Evidence From Africa
# Authors: Evan Lieberman and Andrew Miller
# Year: June 2020

### Load Packages
library(stm)
library(stringr)
library(plyr)
library(SnowballC)
library(ggplot2)
library(ggthemes)
library(stargazer)
library(xtable)
library(foreign)

### Set Working Directory 
# Every relative path used below ("./...RAW.csv", "./nigeria_data.RData") is
# resolved against this directory, so the raw files must live in the home
# directory or this line must be pointed at the data folder before running.
setwd("~")

### Set Seed
# Fix the random number generator state so repeated runs of this script start
# from the same seed.
set.seed(2001)

#### + Article Data ####

#### ++ Article Pre-processing ####

# Loading raw scraped article data (the file is read as Latin-1)
art <- read.csv(
  file = "./vangaurd_2016_articles_RAW.csv",
  row.names = NULL,
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
names(art)

# Keep only columns 2 through 8 of the raw export
art <- art[, 2:8]

# The filters below are applied one after another, each on the rows that
# survived the previous one.

# THREAD_URL: drop rows whose URL does not contain "http"
has_url <- grepl("http", art$THREAD_URL)
art <- art[has_url, ]

# ARTICLE_TEXT: drop articles with fewer than 200 characters of text
text_len <- nchar(art$ARTICLE_TEXT)
art <- art[!is.na(text_len) & text_len >= 200, ]

# ARTICLE_TITLE: drop articles whose title has fewer than 10 characters
# (measured before the " - Vanguard News" suffix is stripped further down)
title_len <- nchar(art$ARTICLE_TITLE)
art <- art[!is.na(title_len) & title_len >= 10, ]

# ARTICLE_PUBLISHER: keep only rows whose publisher mentions "Vanguard News"
is_vanguard <- grepl("Vanguard News", art$ARTICLE_PUBLISHER)
art <- art[is_vanguard, ]

# ARTICLE_DATE: the first 10 characters of the date-time string, parsed as a Date
date_part <- substr(art$ARTICLE_DATETIME, start = 1, stop = 10)
art$ARTICLE_DATE <- as.Date(date_part)

# Strip the " - Vanguard News" suffix from article titles
art$ARTICLE_TITLE <- gsub(
  pattern = " - Vanguard News",
  replacement = "",
  x = art$ARTICLE_TITLE
)

#### ++ Article Title Variables ####

### Coding article titles
# Each entry maps an indicator column to the keyword stems that trigger it.
# The stems are joined with "|" into one regular expression and matched
# case-insensitively anywhere in ARTICLE_TITLE (substring match, so short stems
# such as "apc" or "epl" also match inside longer words). The indicator is 1
# when the title matches and 0 otherwise.
#
# NOTE(review): "pollice" and "obisanjo" look like misspellings of "police" and
# "obasanjo"; as written they will rarely match a title. They are kept verbatim
# here so the coding (and the saved word_bank) stays exactly as originally run.
title_patterns <- list(
  # economic titles
  title_unemploy = c("unemploy", "job", "recess"),
  title_oil = c(" oil", "petrol", "fuel"),
  title_naira = c("prices", "cost", "naira", "depreciate", "inflation",
                  "forex", "foreign reserves"),
  # political titles
  title_buhari = c("buhari", "presiden"),
  title_saraki = c("saraki", "senate"),
  title_pdp = c("pdp", "obisanjo", "jonathan", "fayose"),
  title_apc = c("apc", "tinubu"),
  title_federal = c("budget", "econ", "tax"),
  # scapegoating titles
  title_corrupt = c("Alleg", "Corrupt", "dubai", "efcc", "dss", "embezzle",
                    "assets", "bribe", "dasuki"),
  title_crime = c("crime", "pollice", "arrest", "murder", "kill",
                  "justice", "kidnap", "robber", "shooting"),
  # ethnic titles
  title_ethnic_hausa = c("hausa"),
  title_ethnic_fulani = c("fula"),
  title_ethnic_yoruba = c("yoruba"),
  title_ethnic_igbo = c("igbo"),
  title_ethnic_ijaw = c("ijaw"),
  # placebos (only football is added to the word bank)
  title_placebo_football = c("football", "fifa", "world cup", "afcon", "epl",
                             "champions league"),
  title_placebo_obama = c("obama"),
  title_placebo_nk = c("north korea", "kim jong-un", "nuclear"),
  title_placebo_eagles = c("eagles"),
  # controls
  title_ethnic_conflict_igbo = c("Biafra", "ipob", "massob", "nnamdi kanu"),
  title_ethnic_conflict_fulani = c("herdsmen"),
  title_ethnic_conflict_bk = c("boko", "haram", "chibok"),
  title_ethnic_conflict_delta = c("delta", "pipeline", "avengers", " MEND "),
  title_relig_christian = c("God", "Christian", "Jesus", "Pastor", "Priest",
                            "bishop"),
  title_relig_muslim = c("islam", "muslim", "allah", "sharia"),
  title_pub_services = c(" road", "electric", "megawatt", "sewage",
                         "hospital", "health", "education", "school"),
  # nation
  title_nigeria = c("nigeria", "naija")
)

# Indicators whose pattern is deliberately left out of the word bank
unbanked <- c("title_placebo_obama", "title_placebo_nk", "title_placebo_eagles")

# word_bank collects one row per banked pattern. It is grown with rbind() on a
# variable called `words` on purpose: that reproduces the original object
# exactly, including its row names.
word_bank <- c()
for (title_col in names(title_patterns)) {
  words <- paste(title_patterns[[title_col]], collapse = "|")
  if (!(title_col %in% unbanked)) {
    word_bank <- rbind(word_bank, words)
  }
  title_hit <- grepl(words, art$ARTICLE_TITLE, ignore.case = TRUE)
  art[[title_col]] <- ifelse(title_hit, 1, 0)
}

# combining hausa-fulani: 1 when either the Hausa or the Fulani indicator is 1
hausa_or_fulani <- art$title_ethnic_hausa == 1 | art$title_ethnic_fulani == 1
art$title_ethnic_hf <- ifelse(hausa_or_fulani, 1, 0)

#### ++ Article Text STM ####

### processing (stemmed)
# Lower-case, remove stop words (stm's English list plus the custom ones below)
# and stem the article text. Column 4 of `art` is left out of the metadata
# (presumably the article text itself -- confirm against the raw column order).
art.processed_h <- textProcessor(
  documents = art$ARTICLE_TEXT,
  metadata = art[, -4],
  lowercase = TRUE,
  removestopwords = TRUE,
  customstopwords = c("said", "will", "can", "news", "vanguard", "say", "by"),
  stem = TRUE,
  language = "en"
)

### remove infrequent words/docs and attach text with meta data
# lower.thresh = 10: per the stm documentation, terms that do not appear in
# more than 10 documents are dropped from the vocabulary.
art.out_h <- prepDocuments(
  documents = art.processed_h$documents,
  vocab = art.processed_h$vocab,
  meta = art.processed_h$meta,
  lower.thresh = 10
)

### shortening outputs
docsh <- art.out_h$documents
vocabh <- art.out_h$vocab
metah <- art.out_h$meta

### run STM
# 10-topic model with spectral initialisation and at most 250 EM iterations;
# no prevalence or content covariates are specified.
art.fit10h <- stm(
  documents = docsh,
  vocab = vocabh,
  K = 10,
  init.type = "Spectral",
  max.em.its = 250,
  data = metah
)

### identifying topic proportions

# extracting topic proportions: one row per modelled document, one column per topic
prop <- data.frame(art.fit10h$theta)
# summing each row (printed as a sanity check)
row_sum <- apply(prop, 1, sum)
row_sum
names(prop) <- paste0("top", 1:10)

### combining topic proportions to main dataset
# The two row counts are printed so they can be compared before binding.
nrow(prop)
nrow(metah)
a2 <- cbind(THREAD_URL = metah[, "THREAD_URL"], prop)
# Left join: every row of `art` is kept, so an article with no row in `a2`
# ends up with NA in top1..top10 (and in the binary columns below).
art <- merge(art, a2, by = "THREAD_URL", all.x = TRUE)

# creating binary topics: 1 when a topic makes up at least 10 percent of the article
threshold <- 0.1
for (topic_no in 1:10) {
  topic_col <- paste0("top", topic_no)
  art[[paste0(topic_col, "_bin10")]] <- ifelse(art[[topic_col]] >= threshold, 1, 0)
}

#### + Comments Data ####

#### ++ Comment Pre-processing ####

## loading data
com <- read.csv("./vangaurd_2016_comments_RAW.csv",
                stringsAsFactors = FALSE)

# removing excess columns
com <- com[, !(names(com) %in% c("eep", "cep", "nep", "coder_note"))]
# changing variables to numeric
com$comment_parent_id <- as.numeric(com$comment_parent_id)
com$thread_id <- as.numeric(com$thread_id)
# removing comments that do not have 2016 in time
# (the pattern is matched anywhere in the timestamp string)
com <- com[grep("2016", com$comment_created_at), ]
# parsing the comment timestamps as date-times (GMT)
# NOTE(review): the result goes into a free-standing vector, not back into
# `com`, so com$comment_created_at keeps its original text form; the vector is
# also built before the two row filters below, so it does not line up with the
# final `com`. Left as-is to keep the saved data unchanged -- confirm whether
# the parsed column was meant to be stored.
comment_created_at <- strptime(com$comment_created_at,
                               format = "%Y-%m-%dT%H:%M:%S", tz = "GMT")
# dropping comments that don't have a URL in thread_url
com <- com[grep("http", com$thread_url), ]
# rename thread_url to THREAD_URL to match the article dataset.
# The column is located by name rather than by a fixed position, so a change
# in the raw column layout cannot silently relabel a different column.
url_col <- match("thread_url", names(com))
stopifnot(!is.na(url_col))
names(com)[url_col] <- "THREAD_URL"
# remove comments that are not associated with an article
com <- com[which(com$THREAD_URL %in% art$THREAD_URL), ]

#### ++ Coding Ethnic/Religious Comments ####

# Each entry maps a comment-level indicator column to the keyword stems that
# trigger it. Stems are joined with "|" and matched case-insensitively anywhere
# in comment_text (substring match); the indicator is 1 on a match, else 0.
comment_stems <- list(
  # any ethnic mention
  comment_ethnic = c("hausa", "fula", "yoruba", "igbo", "ijaw"),
  # individual groups
  comment_ethnic_hausa = c("hausa"),
  comment_ethnic_fulani = c("fula"),
  comment_ethnic_yoruba = c("yoruba"),
  comment_ethnic_igbo = c("igbo"),
  comment_ethnic_ijaw = c("ijaw"),
  # religion
  comment_relig_christ = c("christ"),
  comment_relig_muslim = c("muslim", "islam"),
  # nation
  comment_nigeria = c("nigeria", "naija")
)

for (flag_col in names(comment_stems)) {
  ethnic.words <- comment_stems[[flag_col]]
  ethnic.words_grep <- paste(ethnic.words, collapse = "|")
  print(ethnic.words_grep) # show the pattern being applied
  stem_hit <- grepl(ethnic.words_grep, com$comment_text, ignore.case = TRUE)
  com[[flag_col]] <- ifelse(stem_hit, 1, 0)
}

# combining hausa-fulani: 1 when the comment mentions Hausa, Fulani, or both
hausa_fulani_hits <- com$comment_ethnic_hausa + com$comment_ethnic_fulani
com$comment_ethnic_hf <- ifelse(hausa_fulani_hits >= 1, 1, 0)

#### + Authors Data ####

#### ++ Authors Pre-processing ####

# load data
auth <- read.csv(
  file = "./vangaurd_2016_authors_RAW.csv",
  stringsAsFactors = FALSE
)
names(auth)

# removing excess columns: keep the first 13
auth <- auth[, 1:13]
names(auth)

# converting identifier, flag and reputation columns to numeric
numeric_cols <- c("author_id", "author_is_private", "author_is_anonymous",
                  "author_is_poweruser", "author_reputation")
for (num_col in numeric_cols) {
  auth[[num_col]] <- as.numeric(auth[[num_col]])
}

# parsing the join timestamp ("YYYY-MM-DDTHH:MM:SS") as a date-time in GMT
auth$author_joined_at <- strptime(
  auth$author_joined_at,
  format = "%Y-%m-%dT%H:%M:%S",
  tz = "GMT"
)

#### + Merging Data ####

#### ++ Merging Authors to Comment Dataset ####

# Left join: every comment is kept; author columns are NA where no author
# record matches the comment's author_id.
com <- merge(x = com, y = auth, by = "author_id", all.x = TRUE)

# `d` is the legacy name for the comment dataset.
# NOTE(review): this copy is taken before the later "Merging Articles to
# Comments" step, so `d` does not carry the article-level columns added to
# `com` there -- confirm that this is intended.
d <- com

#### ++ Merging Comment to Articles ####

# Comment-level indicators to roll up to the article level, in the order in
# which they are merged (this fixes the column order of `art`).
comment_flags <- c(
  "comment_ethnic", "comment_ethnic_yoruba", "comment_ethnic_igbo",
  "comment_ethnic_hausa", "comment_ethnic_fulani", "comment_ethnic_ijaw",
  "comment_relig_christ", "comment_relig_muslim", "comment_ethnic_hf",
  "comment_nigeria"
)

# total flagged comments per article, left-joined onto `art`
for (flag in comment_flags) {
  count_formula <- as.formula(paste(flag, "~ THREAD_URL"))
  per_article <- aggregate(count_formula, com, FUN = "sum")
  art <- merge(art, per_article, by = "THREAD_URL", all.x = TRUE)
}

# adding 0 if no comments
# NOTE(review): this fills NA in every column of `art`, not only the comment
# counts merged above -- e.g. any NA topic proportions also become 0.
art[is.na(art)] <- 0

# capturing frequency in the comments for each article before making binary
for (flag in comment_flags) {
  art[[paste0(flag, "_freq")]] <- art[[flag]]
}

# creating binary for at least one comment
for (flag in comment_flags) {
  art[[flag]] <- ifelse(art[[flag]] >= 1, 1, 0)
}

# adding 0 if no comments (repeated after the recode, as in the original)
art[is.na(art)] <- 0

#### ++ Merging Articles to Comments ####

# Attach article-level columns to each comment. Only `art` columns that `com`
# does not already have are brought over, plus column 1 of `art` (THREAD_URL,
# which merge() places first) as the join key. Comments without a matching
# article row are dropped (inner join).
# NOTE(review): `com` is not among the objects passed to save() at the end of
# this script -- confirm whether this merged table is needed downstream.
art_only_cols <- which(!(names(art) %in% names(com)))
com <- merge(x = com, y = art[, c(1, art_only_cols)], by = "THREAD_URL")

#### + Afrobarometer Data ####

# Please download Round 6 of the Nigeria Afrobarometer: https://afrobarometer.org/data/nigeria-round-6-data-2015
# n <- read.spss("./nig_r6_data_2015.sav", to.data.frame=TRUE)

### How often do you get news from the following sources: Internet?
# The Afrobarometer data frame `n` is not created by this script unless the
# read.spss() line above is uncommented; fail with a clear message if it is absent.
if (!exists("n", mode = "list")) {
  stop("Afrobarometer data frame `n` not found: download the Round 6 file and ",
       "run the read.spss() line above before this section.", call. = FALSE)
}
n$Q12D <- as.character(n$Q12D)

# Keep respondents who gave a substantive answer (drops Refused / Don't know /
# Missing, and rows where Q12D is NA).
answered <- !is.na(n$Q12D) & !(n$Q12D %in% c("Refused", "Don't know", "Missing"))
n2 <- n[answered, ]

# Recode the answer labels in one vectorized lookup. Any label not listed here
# becomes NA instead of being left to whatever value a row loop happened to
# recycle into the new column, and a zero-row `n2` is handled without error.
news_labels <- c(
  "Every day" = "Daily",
  "A few times a week" = "Weekly",
  "A few times a month" = "Monthly",
  "Less than once a month" = "<Monthly",
  "Never" = "Never"
)
n2$news_internet <- factor(
  unname(news_labels[n2$Q12D]),
  levels = c("Daily", "Weekly", "Monthly", "<Monthly", "Never")
)

### Q97: What is your highest level of education?
# Answer label -> collapsed education category.
# NOTE(review): there is no entry for completed primary schooling (or for
# non-substantive answers such as "Don't know"); confirm the labels present in
# Q97 and extend this table if a category is missing.
education_map <- c(
  "Post-secondary qualifications, other than university" = "University",
  "Some university" = "University",
  "University completed" = "University",
  "Post-graduate" = "University",
  "Secondary school / high school completed" = "Secondary",
  "Some secondary school / high school" = "Secondary",
  "Some primary schooling" = "Primary",
  "Informal schooling only" = "None or Informal",
  "No formal schooling" = "None or Informal"
)

# Vectorized lookup: rows whose answer is not in the table become NA. The
# previous row-by-row loop never assigned those rows, and because the column
# was created by the first assignment they silently kept a recycled value
# taken from another respondent.
n2$education <- factor(
  unname(education_map[as.character(n2$Q97)]),
  levels = c("University", "Secondary", "Primary", "None or Informal")
)

### Q95 Do you have a job that pays a cash income? If yes, is it full-time or part-time? If no, are you presently looking for a job?
# Answer label -> employment status.
employment_map <- c(
  "No (not looking)" = "Unemployed",
  "No (looking)" = "Unemployed",
  "Yes, part time" = "Employed",
  "Yes, full time" = "Employed"
)

# Vectorized lookup: any other answer becomes NA. The previous row-by-row loop
# left such rows unassigned, so they silently kept a value recycled from the
# first assignment that created the column.
n2$employment <- factor(unname(employment_map[as.character(n2$Q95)]))

### Q14. Discuss politics with friends/family
# Only these three answers are kept; any other answer becomes NA. The previous
# row-by-row loop left such rows unassigned, so they silently kept a value
# recycled from the first assignment that created the column.
politics_levels <- c("Frequently", "Occasionally", "Never")
q14 <- as.character(n2$Q14)
n2$politics <- factor(
  ifelse(q14 %in% politics_levels, q14, NA_character_),
  levels = politics_levels
)

### Q87: What is your ethnic community, cultural group or tribe?
# Hausa and Fulani are pooled; Yoruba, Igbo and Ijaw are kept; every other
# answer is coded "Other".
ethnicity_map <- c(
  "Hausa" = "Hausa-Fulani",
  "Fulani" = "Hausa-Fulani",
  "Yoruba" = "Yoruba",
  "Igbo" = "Igbo",
  "Ijaw" = "Ijaw"
)

# Vectorized lookup in place of a scalar if/else loop, which stopped with an
# error on an NA answer and on a zero-row data frame. An NA answer stays NA.
q87 <- as.character(n2$Q87)
ethnic_group <- unname(ethnicity_map[q87])
ethnic_group[is.na(ethnic_group) & !is.na(q87)] <- "Other"
n2$ethnicity <- factor(
  ethnic_group,
  levels = c("Hausa-Fulani", "Yoruba", "Igbo", "Ijaw", "Other")
)

#### + Saving Datasets ####

# Objects written to the replication file:
#   d          - comments dataset
#   art        - article dataset
#   n2         - afrobarometer dataset
#   auth       - author dataset
#   word_bank  - keywords used to code the headline variables
#   art.fit10h - article STM
save(
  list = c("d", "art", "n2", "auth", "word_bank", "art.fit10h"),
  file = "./nigeria_data.RData"
)
